Isolating cell subsets with scGate: a de novo analysis

Preliminars

Background

scGate is an open source tool designed for gating specific and uncontaminated cell types in a scRNAseq experiment. We aim to gate cell subtypes at high precision/PPV (i.e achieving high cell type purity) and we don’t care about some loss of true cells (i.e. sensitivity > 90% is ideal). We aim at a reliable tool to purify individual cell types from scRNA-seq data from complex samples (e.g. whole tissue, whole tumor) to enable i) downstream cell type-specific analysis (e.g. reference-atlas projection) and ii) cross-study comparisons/meta-analysis of cell types.

This demo

In previous vignettes (tcell.html and tced8.html), we show basic functionality and usage of scGate package over well annotated datasets. In this demo, we present a more realistic situation, namely, when cell types are not known a priori and you are interested in usage scGate for de-novo isolation of any cellular subset of interest. We will work with a mouse dataset Gubin 2018, Yost.2019

R Environment

Using Renv

if (!requireNamespace("renv")) install.packages("renv")
renv::activate()
renv::restore()
library(gridExtra)
library(ggplot2)
library(reshape2)
library(GEOquery)
library(patchwork)
library(Seurat)
library(cowplot)

Or alternatively, manually installing scGate from GitHub

if (!requireNamespace("remotes")) install.packages("remotes")
remotes::install_github("carmonalab/UCell")
remotes::install_github("carmonalab/scGate")
library(scGate)

if (!requireNamespace("GEOquery")){ 
  install.packages("BiocManager")
  BiocManager::install("GEOquery")
}

Download and process Gubin mouse dataset

Brief description

geo_acc <- "GSE119352"
do_download <- T

gse <- getGEO(geo_acc)
datadir <- "input/Gubin"
dir.create(datadir)

series <- paste0(geo_acc, "_series_matrix.txt.gz")
if (do_download) {
    getGEOSuppFiles(geo_acc, baseDir = datadir)
    system(paste0("gunzip ", datadir, "/", geo_acc, "/*meta_data.tsv.gz"))

    for (acc in gse[[series]]$geo_accession) {
        dir.create(file.path(datadir, acc))
        getGEOSuppFiles(acc, baseDir = datadir)  #untar...
        fns <- list.files(paste0(datadir, "/", acc, "/"))
        for (fn in fns) {
            # rename to basic names: matrix, genes, barcodes
            system(paste0("mv ", paste0(datadir, "/", acc, "/", fn), " ", paste0(datadir,
                "/", acc, "/", sub(".*_", "", fn, perl = T))))
        }
        system(paste0("gunzip ", datadir, "/", acc, "/*gz"))

    }
}

# as.character(gse[[1]]$title[gse[[1]]$geo_accession==acc])
geoAccToName <- gse[[1]]$title
names(geoAccToName) <- gse[[1]]$geo_accession

Read the sparse count matrices

data.list <- list()

for (i in 1:length(gse[[series]]$geo_accession)) {
    acc = gse[[series]]$geo_accession[i]
    fname <- paste0(datadir, "/", acc)
    data <- Read10X(fname)
    title <- gse[[1]]$title[i]

    c <- gsub(pattern = "(\\S+)", replacement = paste0("GU-\\1-", i), colnames(data),
        perl = TRUE)
    colnames(data) <- c

    tmp.seurat <- CreateSeuratObject(counts = data, project = "Gubin", min.cells = 3,
        min.features = 50)
    tmp.seurat@meta.data$Sample <- acc
    tmp.seurat@meta.data$Condition <- title

    data.list[[title]] <- tmp.seurat
    rm(tmp.seurat)
}

names(data.list)

data.list[[1]]$Treatment <- "Control"
data.list[[2]]$Treatment <- "aPD1"
data.list[[3]]$Treatment <- "aCTLA4"
data.list[[4]]$Treatment <- "aPD1_aCTLA4"

data.seurat.raw <- Reduce(merge, data.list)

Data Preprocessing

# basic information of generated seurat object
data.seurat.raw
An object of class Seurat 
14490 features across 14618 samples within 1 assay 
Active assay: RNA (14490 features, 0 variable features)

# Condition composition
table(data.seurat.raw$Condition)

                        CD45+ cells from tumor microenvironment of aCTLA4 treated mice 
                                                                                  3372 
               CD45+ cells from tumor microenvironment of aPD1 and aCTLA4 treated mice 
                                                                                  3966 
                          CD45+ cells from tumor microenvironment of aPD1 treated mice 
                                                                                  3440 
CD45+ cells from tumor microenvironment of IgG2a isotype control antibody treated mice 
                                                                                  3840 

# Sample composition
table(data.seurat.raw$Sample)

GSM3371684 GSM3371685 GSM3371686 GSM3371687 
      3840       3440       3372       3966 

# Treatment composition
table(data.seurat.raw$Treatment)

     aCTLA4        aPD1 aPD1_aCTLA4     Control 
       3372        3440        3966        3840 

Ribosomal and mitochondrial content

percent.ribo.dv <- PercentageFeatureSet(data.seurat.raw, pattern = "^Rp[ls]")
percent.mito.dv <- PercentageFeatureSet(data.seurat.raw, pattern = "^mt-")

data.seurat.raw <- AddMetaData(data.seurat.raw, metadata = percent.ribo.dv, col.name = "percent.ribo")
data.seurat.raw <- AddMetaData(data.seurat.raw, metadata = percent.mito.dv, col.name = "percent.mito")

Look at distributions per sample

pl <- VlnPlot(data.seurat.raw, features = c("nFeature_RNA", "nCount_RNA", "percent.ribo",
    "percent.mito"), ncol = 4, pt.size = 0, combine = FALSE, group.by = "Treatment")

pll <- list()
for (i in seq_along(pl)) {
    pll[[i]] = pl[[i]] + theme(axis.text.x = element_blank(), legend.position = "none")
}

plot_grid(plotlist = pll, ncol = 4)

## nFeatures
summary(data.seurat.raw@meta.data$nFeature_RNA)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    166     770    1088    1178    1468    6972 

## nCounts
summary(data.seurat.raw@meta.data$nCount_RNA)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    792    1700    2834    3432    4414   78677 

# % ribo
summary(data.seurat.raw@meta.data$percent.ribo)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.2381 11.6778 16.2097 18.1720 24.1830 65.9930 

# % mito
summary(data.seurat.raw@meta.data$percent.mito)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.6066  0.8999  1.0833  1.2931 92.1209 

Basic quality control

# before filtering
sprintf("%s cells before filtering", dim(data.seurat.raw)[2])
[1] "14618 cells before filtering"

data.seurat <- subset(data.seurat.raw, subset = nFeature_RNA > 500 & nFeature_RNA <
    4000 & nCount_RNA > 1000 & nCount_RNA < 15000 & percent.ribo < 50 & percent.mito <
    10)

# after filtering
sprintf("%s cells after filtering", dim(data.seurat)[2])
[1] "13690 cells after filtering"

Data normalization and unsupervised analysis

ndim = 30
set.seed(1234)

data.seurat <- NormalizeData(data.seurat, verbose = FALSE)
data.seurat <- FindVariableFeatures(data.seurat, selection.method = "vst", nfeatures = 1000,
    verbose = FALSE)
bk.list <- unlist(scGate::genes.blacklist.Mm)

data.seurat@assays$RNA@var.features <- setdiff(data.seurat@assays$RNA@var.features,
    bk.list)

data.seurat <- ScaleData(data.seurat)
data.seurat <- RunPCA(data.seurat, features = data.seurat@assays$RNA@var.features,
    ndims.print = 1:5, nfeatures.print = 5, npcs = ndim)

## Run Umap
data.seurat <- RunUMAP(data.seurat, reduction = "pca", dims = 1:ndim, seed.use = 123)

2D visualization using UMAP

DimPlot(data.seurat, reduction = "umap", group.by = "Treatment") + ggtitle("UMAP by sample")

Plot some clasical markers

features = c("Ptprc", "Lck", "Csf1r", "Cd3e", "Foxp3", "Cd8a", "Cd4", "H2-Eb1", "H2-Aa",
    "Cd300e", "Itgal", "C1qa", "C1qb", "Gpnmb", "Fabp5", "Saa3", "Clec10a", "Irf8",
    "Ccr7", "S100a9", "Plac8", "Xcr1", "Spi1", "Siglech", "Ccr9", "nCount_RNA", "nFeature_RNA")

plt.classical.mkr <- FeaturePlot(data.seurat, reduction = "umap", features = features,
    order = T, ncol = 4)

plot(plt.classical.mkr)

Display some selected markers

features = c("Lck", "Csf1r", "Spi1", "Cd3e", "H2-Eb1", "Ccr7", "S100a9", "Xcr1",
    "Siglech")

plt.selected.mkr <- FeaturePlot(data.seurat, reduction = "umap", features = features,
    order = T, ncol = 3, max.cutoff = "q99")
plot(plt.selected.mkr)

Export full dataset

dir.create("aux")
saveRDS(data.seurat, file = "aux/Gubin_CD45_clean.rds")

scGate for isolating cell subsets

Filter on T cells

ffile = "./aux/tcell.gubin.rds"
if (file.exists(ffile)) {
    gate = F
    data.T <- readRDS(ffile)
} else {
    gate = T
}
data.T <- scGate(data.seurat, gating.model = scGate_DB$mouse$Tcell, sd.out = 5)
saveRDS(data.T, ffile)
plt.ispure <- DimPlot(data.T, group.by = "is.pure") + theme(aspect.ratio = 1)
plt.ann <- DimPlot(data.T, group.by = "scGate.annotation") + theme(aspect.ratio = 1)

features = c("Lck", "Cd3e", "Fcer1g")
lymphoid <- FeaturePlot(data.T, reduction = "umap", ncol = 3, features = features,
    order = T, max.cutoff = "q99") + theme(aspect.ratio = 1)

layout <- "
AAAAAAA
BBB#CCC"

plt <- wrap_plots(A = lymphoid, B = plt.ispure, C = plt.ann, design = layout)
plt

Filter myeloid APCs

scGate APC model

ffile = "./aux/apc.gubin.rds"
if (file.exists(ffile)) {
    gate = F
    data.myel.scgate <- readRDS(ffile)
} else {
    gate = T
}
data.myel.scgate <- scGate(data.seurat, gating.model = scGate_DB$mouse$Monocyte_Macrophage_Dendritic_pDendritic,
    sd.in = 3, sd.out = 7)
saveRDS(data.myel.scgate, ffile)
plt.ispure <- DimPlot(data.myel.scgate, group.by = "is.pure")
plt.ann <- DimPlot(data.myel.scgate, group.by = "scGate.annotation")

features = c("Csf1r", "Spi1", "H2-Eb1", "S100a9")
apcs <- FeaturePlot(data.myel.scgate, reduction = "umap", ncol = 4, features = features,
    order = T, max.cutoff = "q99")

layout <- "
AAAAA
BB#CC"
plt <- wrap_plots(A = apcs, B = plt.ispure, C = plt.ann, design = layout)
plt

Project into myeloid atlas Load atlas

#Project Gubin data into reference myeloid atlas